package org.analyzer.ansj;

import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.io.Reader;
import java.io.BufferedReader;
import java.util.*;


public final class AnsjTokenizer extends Tokenizer{

	public  static final String SPACES = " 　\t\r\n";

	private static Set<String> stopwords = new HashSet<String>();
	final static String stop = " ',.`-_=?\'|\"(){}[]<>*#&^$@!~:;+/《》—－，。、：；！!·？“”）（【】［］●'";
	private int analysisType ; 
	private boolean removePunc;
	//for sentences split
	private final StringBuilder buffer = new StringBuilder();
	private int tokenStart = 0, tokenEnd = 0;


	private CharTermAttribute termAtt;
	private OffsetAttribute offsetAtt;
	private TypeAttribute typeAtt;
	private PositionIncrementAttribute positionIncrementAtt;
	int lastOffset = 0;
	int endPosition =0; 
	private Iterator<Term> tokenIter;
	private List<Term> tokenBuffer;
	static
	{
		for(String c : stop.split("")){
			stopwords.add(c);
		}
//		new ToAnalysis(new StringReader(""),new Forest[]);
	}

	public AnsjTokenizer(Reader input, int analysisType, boolean removePunc) {
		super(input);
		offsetAtt = addAttribute(OffsetAttribute.class);
		termAtt = addAttribute(CharTermAttribute.class);
		typeAtt = addAttribute(TypeAttribute.class);
		positionIncrementAtt = addAttribute(PositionIncrementAttribute.class);
		this.analysisType = analysisType;
		this.removePunc = removePunc;
	}

	@Override
	public boolean incrementToken() throws IOException {
		if (tokenIter == null || !tokenIter.hasNext()){
			String currentSentence = checkSentences();
			if (currentSentence!= null){
				tokenBuffer = new ArrayList<Term>();
                List<Term> terms = null;
                if (analysisType == 1){
                    terms =  ToAnalysis.parse(currentSentence);
				} else if(analysisType == 2){
					terms = NlpAnalysis.parse(currentSentence);
				} else {
                    terms = IndexAnalysis.parse(currentSentence);
				}

                for(Term term:terms){
                    if (removePunc && stopwords.contains(term.getName()))
                        continue;
                    tokenBuffer.add(term);
                }
				tokenIter = tokenBuffer.iterator();
				if (!tokenIter.hasNext()){
					return false;
				}
			} else {
				return false; // no more sentences, end of stream!
			}
		}
		clearAttributes();

		Term term = tokenIter.next();
		if (removePunc){
			while(stopwords.contains(term.getName())){
				if (!tokenIter.hasNext()){
				}else{
					term = tokenIter.next();
				}
			}
		}
		termAtt.append(term.getName());
		termAtt.setLength(term.getName().length());

		int currentStart = tokenStart + term.getOffe();
		int currentEnd = tokenStart + term.getToValue();
		offsetAtt.setOffset(currentStart,currentEnd);
		typeAtt.setType("word");

//		int pi = currentStart - lastOffset;
//		if(term.getOffe()  <= 0) {
//			pi = 1;
//		}
//		positionIncrementAtt.setPositionIncrement( pi );
		lastOffset = currentStart;
		endPosition = currentEnd;
		return true;
	}

    private String checkSentences1() throws IOException{
        BufferedReader in = new BufferedReader(input);
        String line = null;
        while ((line = in.readLine()) != null){
            buffer.append(line);
        }
        return buffer.toString();
    }

	private String checkSentences() throws IOException{
		buffer.setLength(0);
		int ci;
		char ch, pch;
		boolean atBegin = true;
		tokenStart = tokenEnd;
		ci = input.read();
		ch = (char) ci;

		while (true) {
			if (ci == -1) {
				break;
//			} else if (PUNCTION.indexOf(ch) != -1) {
//				// End of a sentence
//				buffer.append(ch);
//				tokenEnd++;
//				break;
			} else if (atBegin && SPACES.indexOf(ch) != -1) {
				tokenStart++;
				tokenEnd++;
				ci = input.read();
				ch = (char) ci;
			} else {
				buffer.append(ch);
				atBegin = false;
				tokenEnd++;
				pch = ch;
				ci = input.read();
				ch = (char) ci;
				// Two spaces, such as CR, LF
				if (SPACES.indexOf(ch) != -1
						&& SPACES.indexOf(pch) != -1) {
					// buffer.append(ch);
					tokenEnd++;
					break;
				}
			}
		}
		if (buffer.length() == 0){
			//sentences finished~
			return null;
		}else {
			return buffer.toString();
		}

	}

	@Override
	public void reset() throws IOException {
		super.reset();
		tokenStart = tokenEnd = 0;
	}

	public final void end() {
		// set final offset
		int finalOffset = correctOffset(this.endPosition);
		offsetAtt.setOffset(finalOffset, finalOffset);
	}

}